#message=FALSE means some output messages aren't printed in the knitted HTML document
knitr::opts_chunk$set(echo = FALSE, warning=FALSE, message=FALSE)
#message=FALSE, echo=FALSE and warning=FALSE ensure any error messages aren't shown in the output HTML file
#message=FALSE means no output messages are printed on the knitted HMTL file
#loading any necessary packages
library(dplyr)
library(tidyverse)
library(knitr)
library(readr)
library(ggplot2)
library(bbmle)
library(mizer)
library(Hmisc)
library(lme4)
library(RColorBrewer)
library(viridis)
library(hexbin)
library(ggpattern)
load("stomach_dataset.Rdata")
#loading the dataset to be used in analysis (the dataset is already named stom_df)
df <- stom_df %>%
transmute(haul_id, ices_rectangle, year, pred_species,
pred_weight_g, pred_length_cm, prey_weight_g,
prey_type = prey_funcgrp, indiv_prey_weight = prey_ind_weight_g,
prey_count, n_stomachs,
no._prey_per_stmch = prey_count/n_stomachs, ppmr)
#creating a new data set called 'df' which only contains certain selected columns
df <- df[df$indiv_prey_weight != 0, ]
df <- df[df$pred_weight_g != 0, ]
df <- df[df$indiv_prey_weight != Inf, ]
#Removes any points where the prey weight = Inf or 0, or the predator weight = 0
df<- df[!(is.na(df$year)),]
df<- df[!(is.na(df$ices_rectangle)),]
#Removes any points where the value is NA
renamed_df = df %>%
mutate(pred_species = replace(pred_species,
pred_species == "Micromesistius poutassou", "Blue Whiting")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Gadus morhua", "Cod")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Limanda limanda", "Common Dab")) %>%
mutate(pred_species = replace(pred_species,
pred_species == "Merluccius merluccius", "European Hake")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Melanogrammus aeglefinus", "Haddock")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Clupea harengus", "Herring")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Trachurus trachurus", "Horse Mackerel")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Scomber scombrus", "Mackerel")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Lepidorhombus whiffiagonis", "Megrim")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Lophius piscatorius", "Monkfish")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Trisopterus esmarkii", "Norway Pout")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Pleuronectes platessa", "Plaice")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Trisopterus minutus", "Poor Cod")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Solea solea", "Sole")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Sprattus sprattus", "Sprat")) %>%
mutate(pred_species = replace(pred_species, pred_species == "Merlangius merlangus", "Whiting"))
#Renaming the predator species from their latin names (e.g. Capros apers) to their more common names (e.g. Boarfish)
#Creates a new data frame (called 'renamed_df') with these replaced names
species_list <- c("Blue Whiting", "Cod", "Common Dab", "European Hake", "Haddock", "Herring",
"Horse Mackerel", "Mackerel", "Megrim", "Monkfish", "Norway Pout",
"Plaice", "Poor Cod", "Sole", "Sprat", "Whiting")
#Creates an array called 'species_list', which is list of the predator species we are focusing on in this project
renamed_df <- renamed_df[renamed_df$pred_species %in% species_list, ]
#Removes any observations of predator species not in the species_list, i.e. they are irrelevant data points for this project
renamed_df$lppmr <- log(renamed_df$ppmr)
renamed_df$lprey_weight <- log(renamed_df$indiv_prey_weight)
renamed_df$lpred_weight <- log(renamed_df$pred_weight_g)
#Adds columns which take the log value of ppmr, individual prey weight and predator weight to the main data set
This data set is formed from recordings taken by multiple ships between the years 1886-2016.
Fish were taken from some location, and their individual stomach contents recorded. Predators of the same species and (roughly) the same weight/size were recorded as a single data point (the number of predators for each data set is called ‘n_stomachs’). Prey of (roughly) the same size found in these stomachs were recorded in this same data point, and the total number of prey in some data point is called ‘prey_count’. For each individual data point,
\[ \text{no. prey per stomach} = \frac{\text{no. of stomachs sampled}}{\text{total no. of prey}} \] This is an approximation of the number of prey per stomach for some specific predator, and is called ’ no._prey_per_stmch’.
Using similar logic,‘prey_weight_g’ is the total weight of prey in the multiple stomachs sampled to make up one datapoint. ‘indiv_prey_weight’ is then the weight of one single prey individual found in the stomach
There are some other column names which are useful to note:
ggplot(renamed_df, aes(x=prey_type, fill=prey_type)) +
geom_bar_pattern(stat = "count",
pattern_color = "white",
pattern_fill = "black",
aes(pattern = prey_type, pattern_angle = prey_type)) +
facet_wrap(~renamed_df$pred_species, scale="free_y") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Distribution of prey types for each predator species", x="Prey type",
y="Number of Observations")
#geom_bar_pattern() creates a bar chart from the specified data, where each category has a different pattern used to fill it
#Using different patterns as well as different colours to fill the bars ensures the data can be understood if looked at in grey scale, and makes the document accessible for colour blind people
#facet_wrap() means an individual bar chart is created for each individual predator species
#'theme()' adjusts the labels along the x-axis so they lie slightly angled to the vertical direction so they are easy to read
#scale_fill_brewer changes the colour palette of the plot so that the colours are easy to distinguish between, and renames the title of the key to show which colour means what
These are individual bar charts showing the distribution of the type of prey each predator eats.
The prey types are:
We can see that some predators eat a range of prey types (e.g. Whiting), while others prefer to eat a single prey type in abundance (e.g. European Hake and Monkfish mostly consume fish as their prey).
ggplot(data = renamed_df, aes(lprey_weight, no._prey_per_stmch)) +
labs(title = "Number of prey per predator stomach v. log(prey weight) ",
x="log(prey weight)", y="Number of prey per predator stomach") +
geom_point(size=0.5)
#geom_point() adds individual points which show individual recorded points
#size=0.5 defines the size of each point on this graph
This graph is looking at the distribution of the weight of prey recorded, i.e. looking at what is the most common prey weight over all prey species. It is a graph over all the data points, so includes recordings from all the ships involved.
There are some potentially interesting results, so results from different ships were plotted on individual curves to further look into these results.
renamed_df$haul_id_short <- gsub("\\-.*", "", renamed_df$haul_id)
renamed_df$haul_id_short <- gsub("_", "", renamed_df$haul_id_short)
#the haul_id values are renamed to be shortened versions of the ship names (e.g. CLYDE) rather than the complete id (e.g. CLYDE-1935-6)
ggplot (data = renamed_df, aes(x=lprey_weight, y=no._prey_per_stmch)) +
labs(title = "Number of prey per stomach v. log(prey weight) - separated by all ship names",
x="log(prey weight)", y="Number of prey per stomach") +
geom_point(size=0.02, colour="red") +
theme(strip.text = element_text(size = 5)) +
facet_wrap(~renamed_df$haul_id_short, scale="free_y")
Taking a closer look at some of these ships gives the plots below:
interesting_haul <- filter(renamed_df,
haul_id_short=='CLYDE'|haul_id_short=='END04'|haul_id_short=='LUC'|
haul_id_short=='HIDDINK'|haul_id_short=='EXCmacDATSTO815'|
haul_id_short=="Excmacdatsto815error")
#interesting_haul is a new data frame which only contains values recorded by ships which gave "interesting" looking datapoints
haul_list_interesting <- unique(interesting_haul$haul_id_short)
#creates an array containing every individual (renamed) ship name that we are interested in
ggplot (data = interesting_haul, aes(x=lprey_weight, y=no._prey_per_stmch)) +
labs(title = "Number of prey per stomach v. log(prey weight)
- separated by ship names, for a selection of 6 ships",
x="log(prey weight)", y="Number of prey per stomach") +
geom_point(size=0.02, colour="red") +
theme(strip.text = element_text(size = 10)) +
facet_wrap(~interesting_haul$haul_id_short, scale="free_y")
These six graphs show a range of unusual looking points:
There are a number of other ships which also provided potentially erroneous data, and the number of datapoints affected are as below:
type1 <- length((filter(renamed_df, haul_id_short=='END04'))$haul_id_short)
#counting the number of observations recorded for each sort of error or interesting relation
type2 <- length((filter(renamed_df, haul_id_short=='CLION09' | haul_id_short=='LUC' | haul_id_short=='UNITY' | haul_id_short=='CLYDE'))$haul_id_short)
type3 <- length((filter(renamed_df, haul_id_short=='BEAVER'|haul_id_short=='BRUCELLA'|haul_id_short=='BULLEN'|haul_id_short=='CIROL04'|haul_id_short=='CIROL10'|haul_id_short=='CLION03'|haul_id_short=='CLION07'|haul_id_short=='CLION12'|haul_id_short=='COREL02'|haul_id_short=='COREL06'|haul_id_short=='CORY04'|haul_id_short=='CYPRIS'|haul_id_short=='END02'|haul_id_short=='END12'|haul_id_short=='HEINCKE147'|haul_id_short=='HIDDINK'|haul_id_short=='OITHONA'|haul_id_short=='STRANDLINE'|haul_id_short=='TELLINA'))$haul_id_short)
type4 <- length((filter(renamed_df, haul_id_short=='EXCmacDATSTO815'|haul_id_short=='Excmacdatsto815error'))$haul_id_short)
ship_erroneous <- data.frame("Graph_Type"=c("y proportional to exp(-x)", "Single prey weight"
, "Same number of prey per stomach", "Same data"),
"Number_of_observations"=c(type1, type2, type3, type4))
#creates a data frame which shows the number of observations of each error 'type'
kable(ship_erroneous)
| Graph_Type | Number_of_observations |
|---|---|
| y proportional to exp(-x) | 2093 |
| Single prey weight | 554 |
| Same number of prey per stomach | 26854 |
| Same data | 5618 |
#creates a table of the ship_erroneous data frame as an output
In total, 35445 out of the total 267431 observations lie within these potentially erroneous groups. This is: \[ \frac{35445}{267431} \times 100 = 13.25389 \% \]
This means that approximately \(13 \%\) of the data set comes from potentially unreliable sources. Though we will not remove or alter the data set to account for these potentially erroneous data points in this project, for further analysis it may be sensible to remove the data points these six ships recorded so that any results are as reliable as possible.
We are attempting to find a link between the predator weight and the prey weight. Log() of each axis is used to see the proportionality of the axes, where the slope of the added line should = 1. This is because:
\[ \log (\text{prey weight}) = m \times \text{log(predator weight)} + c , \]
where m is the gradient of the slope and c is the y-intercept (thinking of this graph as a linear model).
Taking the exponential of both sides,
\[ \text{prey weight} = \exp({{m}\times{\log(\text{predator weight})}}) + \log(D) , \] where log(D) = c.Â
Finally,
\[ \text{prey weight} = \text{predator weight}^m \times D \]
Hence, we want m=1 to show that log(predator weight) is proportional to log(prey weight) (as expected).
#message=FALSE means some output messages aren't printed in the knitted HTML document
ggplot(data = renamed_df, aes(lpred_weight, lprey_weight)) +
labs(title = "log(prey weight) v. log(predator weight) plot",
x="log(predator weight)", y="log(prey weight)") +
geom_hex(bins = 50) +
scale_fill_gradient2(low = "black", mid = "orange", high = "red", midpoint = 200) +
stat_smooth(method='lm', se=FALSE, colour="blue") +
theme_classic()
#theme_classic() removes the grey background of the plot to make it easier to read and understand
# stat_smooth adds a line of best fit plotted through the points
# method='lm' ensures it is a straight line, i.e. a linear model
# se=FAlSE creates only a single line, with no error of margin included
#geom_hex creates "bins" in the data where the density is calculated over
#bins=50 means that there are 50 "bins" in the horizontal direction and 50 in the vertical
#scale_fill_gradient2 creates a density colour scale, meaning that areas with a high density of points are red and areas with a low density of points are darker in colour
slope <- coef(lm(renamed_df$lprey_weight~renamed_df$lpred_weight))
cat("Slope of the log(prey weight) v. log(predator weight) line of best fit:", slope[2])
## Slope of the log(prey weight) v. log(predator weight) line of best fit: 0.6712536
#'cat()' prints some output message as purely the characters included in the '()' section
# coef(lm()) creates an array containing the coefficients of the linear model of the relationship between two axes
# The first item of the array gives out the y-intercept of the line, and the second item of the array is the gradient
For this graph, the slope is not equal to one. However, this graph is plotted over the entire data set and includes values from all of the available predator species.
Instead, separate the plots by predator species to see if there is some proportionality between the predator and prey weight for each specific species.
ggplot(data = renamed_df, aes(lpred_weight, lprey_weight)) +
labs(title = "log(prey weight) v. log(predator weight), separated by predator species",
x="log(predator weight)", y="log(prey weight)") +
geom_point(size=0.01, colour="red") +
facet_wrap(~pred_species, scale="free_y") +
theme(strip.text = element_text(size = 10)) +
stat_smooth (method='lm', se=FALSE, colour="black")
i <- 1
species_grad = c()
#setting i=1 for the while loop
#and creating an empty vector called 'species_grad' to hold data
while(i<=length(species_list)){
species_df <- renamed_df %>% filter(pred_species == fixed(species_list[i]))
grad <- coef(lm((species_df$lprey_weight)~(species_df$lpred_weight)))
species_grad[i] <- grad[2]
i=i+1
}
#'while' means that this function repeats through the length of 'species_list' until all predator species are accounted for
#i=i+1 increases the value of i each time through this loop to motivate this repeat
#this creates a data frame (species_df) containing each indivdual predator species, then calculates the gradient of the log(pred weight) v. log(prey weight) graph for this individual predator species, then inserts this gradient value to the ith value of an array named 'species_grad'
pvp_grads <- data.frame("Predator_Species"=c(species_list),
Gradient=c(species_grad))
kable(pvp_grads)
| Predator_Species | Gradient |
|---|---|
| Blue Whiting | -0.2433567 |
| Cod | 0.7269038 |
| Common Dab | 0.6359695 |
| European Hake | 0.7677213 |
| Haddock | 0.7004115 |
| Herring | 0.1507461 |
| Horse Mackerel | 0.5960227 |
| Mackerel | 0.2379693 |
| Megrim | 1.1524540 |
| Monkfish | 0.8168852 |
| Norway Pout | 0.9604312 |
| Plaice | 0.7014262 |
| Poor Cod | 1.0845397 |
| Sole | 0.3803803 |
| Sprat | 0.5874927 |
| Whiting | 0.6945304 |
#creates a new data frame name 'pvp_grads', with one column named 'pred_species' and the other named 'gradient'
#each row contains the name of some individual predator species and its associated gradient
#kable prints the information in the pvp_grads
Here, we are splitting up the graphs over each individual predator species to look and see if there is a proportionality between the predator and prey masses for each specific predator species.
We are wanting gradient\(=1\) to suggest proportionality between the two variables. This is true (\(\pm 0.5\)) for the species:
This represents 13 out of the 16 possible predator species, hence the assumption that the predator and prey masses are proportional is a reasonable one. Therefore, we will continue with this assumption.
Having proportionality between the predator and prey masses allows us to use the PPMR (predator prey mass ratio) for further analysis in this project.
Here, we are looking for the most common log(PPMR) for each individual species. This will be done by plotting log(PPMR) against the density of points, and weighting observations on different variables. By assuming these are normally distributed relations, there should be a ‘peak’ point of log(ppmr) which is where the mean/most common log(ppmr) value lies. We assume this will be different for different species of predator, i.e. each predator has a preferred ppmr value and hence each predator type has a preferred relative size of prey.
These plots are weighted by the prey biomass. This means that we are looking at the distribution of prey biomass across values of log(PPMR), so points with a larger biomass are prioritised/weighted more than those with a smaller biomass.
ggplot(data = renamed_df, aes(x=lppmr)) +
labs(title = "Density plot of log(PPMR), weighted by biomass density of prey",
x="log(PPMR)", y="Biomass density of prey") +
facet_wrap(~renamed_df$pred_species, scale="free_y") +
theme(strip.text = element_text(size = 5)) +
geom_density(aes(weight = prey_weight_g), colour="red")
#using facet_wrap for the variable (pred_species) means the data is separated into individual plots for each predator species
#geom_density means area under the curve = 1 (i.e. the graph is normalised)
#weight=prey_weight_g means the points are 'weighted' by the column weight (biomass) of each individual prey
These plots are weighted by the number pf prey per stomach. This means that data points with a larger number of prey per stomach are prioritised/weighted more than those with a smaller number of prey per stomach.
ggplot(data = renamed_df, aes(x=lppmr)) +
labs(title = "Density plot of log(PPMR), weighted by number density of prey",
x="log(PPMR)", y="Number density of prey") +
facet_wrap(~renamed_df$pred_species, scale="free_y") +
theme(strip.text = element_text(size = 5)) +
geom_density(aes(weight = no._prey_per_stmch), colour="green")
#weight=no._prey_per_stmch means the points are 'weighted' by the number of prey per stomach
By looking at the two differently weighted graphs, it is clear to see that the plots are ‘shifted’ by some amount (e.g. the Blue Whiting when weighted by prey biomass has a mean log(ppmr) of roughly 4.8, and when weighted by number of prey this mean becomes roughly 7.5).
Here, we take only observations relating to the predator type Herring. This allows us to do more specific analysis about how density observations can be weighted, and explain why the weighted means are shifted by some amount.
herringDF <- renamed_df %>%
filter(pred_species == fixed("Herring"))
#creates a separate data set (called 'herringDF') only containing observations for the predator species 'Herring'
The curve with points weighted by the prey biomass is plotted in blue, and a normal curve (also weighted by prey biomasss) is plotted over the top as a dashed black line.
bio_herringmean = weighted.mean(herringDF$lppmr, w = herringDF$prey_weight_g, na.rm = TRUE)
bio_herringSD = sqrt(wtd.var(herringDF$lppmr, w = herringDF$prey_weight_g, na.rm = TRUE))
#weighted.mean gives the arithmetic mean of log(ppmr), where datapoints are weighted by the prey weight of observations
#similarly, wtd.rvar is the variance of log(ppmr), where the datapoints are also weighted by the prey weight
#standard deviation is defined as the square root of the variance
#na.rm=TRUE means any rows with missing values (values that equal 'na') aren't included in the mean/variance calculations, but are instead ignored
ggplot(data = herringDF, aes(x=lppmr)) +
labs(title = "Density plot of log(ppmr) for Herring,
weighted by prey biomass",
x="log(PPMR)",y="Biomass density of observations") +
geom_density(aes(weight = prey_weight_g), colour="red") +
theme(plot.title = element_text(size=15), axis.text.y = element_text(size=10)) +
stat_function(fun = dnorm,
linetype="dashed",
args= with(herringDF, c(mean = bio_herringmean, sd = bio_herringSD))) +
xlim(-5,17)
#axis.text.y adjusts the size of the axis labels on the y-axis so they are most easily readable
#stat_function adds a normal distribution curve (fun=dnorm) with a mean and standard deviation equal to what was calculated earlier for this data set
#linetype makes the line for the normal distribution into a dotted (black) line
#xlim sets limits for the x-axis so that all the necessary data points can be seen
cat("Mean value of log(ppmr), weighted by biomass of prey:", bio_herringmean)
## Mean value of log(ppmr), weighted by biomass of prey: 6.781942
cat("\n")
#creates a line break so that the outputs appear on different lines in the knitted html document
cat("Standard deviation of this:", bio_herringSD)
## Standard deviation of this: 2.370106
The curve with points weighted by the number of prey per stomach is plotted in red, and a normal curve (also weighted by the number of prey per stomach) is plotted over the top as a dotted black line.
no_herringmean = weighted.mean(herringDF$lppmr, w = herringDF$no._prey_per_stmch, na.rm = TRUE)
no_herringSD = sqrt(wtd.var(herringDF$lppmr, w = herringDF$no._prey_per_stmch, na.rm = TRUE))
#the mean and variance are both weighted by the number of prey
ggplot(data = herringDF, aes(x=lppmr), group=1) +
labs(title = "Density plot of log(ppmr) for Herring,
weighted by number of prey per stomach",
x="log(PPMR)", y="Number density of observations") +
geom_density(aes(weight = no._prey_per_stmch), colour="green") +
theme(plot.title = element_text(size=15), axis.text.y = element_text(size=10)) +
stat_function(fun = dnorm,
linetype = "dotted",
args= with(herringDF, c(mean = no_herringmean, sd = no_herringSD))) +
xlim(0,25)
#the xlim is different to the graph above because these data points lie over a slightly different log(ppmr) range
cat("Mean value of log(ppmr), weighted by the number of prey per stomach:", no_herringmean)
## Mean value of log(ppmr), weighted by the number of prey per stomach: 13.68691
cat("\n")
cat("Standard deviation of this:", no_herringSD)
## Standard deviation of this: 2.473435
ggplot(data = herringDF, aes(x=lppmr), group=1) +
labs(title = "Density plot of log(ppmr) for Herring,
with various weightings",
x="log(PPMR)", y="Number/biomass density of observations",
color="Line", linetype="Line") +
geom_density(aes(weight = no._prey_per_stmch, colour="prey biomass",
linetype="prey biomass")) +
geom_density(aes(weight = prey_weight_g, colour="number of prey per stomach",
linetype="number of prey per stomach")) +
stat_function(fun = dnorm,
args= with(herringDF, c(mean = no_herringmean, sd = no_herringSD)),
aes(colour="number of prey per stomach - normal curve",
linetype="number of prey per stomach - normal curve")) +
stat_function(fun = dnorm,
args= with(herringDF, c(mean = bio_herringmean, sd = bio_herringSD)),
aes(colour="prey biomass - normal curve",
linetype="prey biomass - normal curve")) +
theme(plot.title = element_text(size=20)) +
scale_color_manual(values=c('green', 'black', 'red', 'black')) +
scale_linetype_manual(values = c("solid", "dashed", "solid", "dotted")) +
xlim(-5,25)
#scale_color_manual manually adds a key to the graph to describe what the differently coloured lines represent
#scale_linetype_manual manually adds a key to the graph to describe what the differently shaped lines represent (e.g. some lines are dotted, dashed or solid)
#each curve is given a name using 'aes(colour="")', and these are then given a specific colour using 'values='
This is a graph with both the the distribution of Herring log(PPMR) as weighted by prey biomass and number of prey plotted over each other (along with appropriately weighted normal distribution curves plotted over the top of each).
and
The mean is shifted by: \(13.54102 - 6.629177 = 6.911843\).
The mathematics of shifted means for this difference in weighting is:
\[ \text{weighted mean}_{\text{expected prey biomass}} = \text{weighted mean}_{\text{no. of prey }} - (\text{standard deviation}_{\text{no. of prey }})^2 \]
Hence, the expected result is:
\[ \text{weighted mean}_{\text{actual prey biomass}} = 13.54102 - (2.63330496940788)^2 = 6.60672493809 \]
There is a difference in expected and actual prey biomass weighted mean of 0.30511806191. This is a fairly insignificant value, which suggests the shifting mean equations are relatively accurate for this data set with these weightings.
Therefore, we can continue with the assumption that there is a difference in the mean of log(ppmr) of the square of the standard deviation when we weight by number of prey per stomach and prey biomass.
ggplot(data=renamed_df, aes(lpred_weight, lppmr)) +
geom_point(size=0.001, colour="cornflowerblue") +
labs(title = "log(PPMR) v. log(predator mass) plot",
x="log(Predator mass)", y="log(PPMR)") +
stat_smooth (method='lm', se=FALSE, colour="black") +
facet_wrap(~pred_species, scale="free_y")
Graphing \(\log\)(predator mass) v. \(\log\)(PPMR) for individual predator species to see if the predator mass related to the ppmr.
\[ \log(\text{PPMR}) = m \times \log (\text{predator mass}) + c \]
where m is the gradient (assuming a linear model between the variables) and c is the y-intercept.
We want them to not be proportional (i.e. slope, m= 0) to prove that log(predator mass) is not related to the log(ppmr). This would mean that the ppmr is independent of the predator mass for a certain species (as assumed).
j <- 1
species_grad_ppmr = c()
while(j<=length(species_list)){
species_df <- renamed_df %>% filter(pred_species == fixed(species_list[j]))
grad <- coef(lm((species_df$lppmr)~(species_df$lpred_weight)))
species_grad_ppmr[j] <- grad[2]
j=j+1
}
ppmrvp_grads <- data.frame("Predator_species"=c(species_list),
gradient=c(species_grad_ppmr))
kable(ppmrvp_grads)
| Predator_species | gradient |
|---|---|
| Blue Whiting | 1.2433567 |
| Cod | 0.2730962 |
| Common Dab | 0.3640305 |
| European Hake | 0.2322787 |
| Haddock | 0.2995885 |
| Herring | 0.8492539 |
| Horse Mackerel | 0.4039773 |
| Mackerel | 0.7620307 |
| Megrim | -0.1524540 |
| Monkfish | 0.1831148 |
| Norway Pout | 0.0395688 |
| Plaice | 0.2985738 |
| Poor Cod | -0.0845397 |
| Sole | 0.6196197 |
| Sprat | 0.4125073 |
| Whiting | 0.3054696 |
For the assumption to be upheld, we need gradient \(= 0\). This is (approximately, by \(\pm 0.5\)) satisfied for the species:
This means that the gradient of 12 out of the possible 16 species is equal to \(0 \pm 0.5\). However, this amount of error is fairly large, so we have not supported the idea that the assumption of independence between the \(log\text{ppmr}\) and \(\log(\text{predator mass})\) can be upheld for this data set.
species_df <- renamed_df %>% filter(pred_species == fixed("Poor Cod"))
#creating a data frame only containing observations where the predator species is poor cod.
ggplot(data=species_df, aes(lpred_weight, lppmr)) +
geom_point(size=0.5) +
labs(title = "log(PPMR) v. log(predator mass) plot: Poor Cod",
x="log(Pred mass)", y="log(PPMR)", color="Weightings", linetype="Weightings") +
stat_smooth(aes(weight=no._prey_per_stmch, color='by number of prey in stomach',
linetype='by number of prey in stomach'),
method='lm', se=FALSE) +
stat_smooth(aes(weight=prey_weight_g, color='by prey biomass', linetype='by prey biomass'),
method='lm', se=FALSE) +
stat_smooth(aes(color='no weighting', linetype='no weighting'), method='lm', se=FALSE) +
stat_summary(fun.data=mean_se, geom="linerange") +
scale_color_manual(values=c("green", "red", "blue")) +
scale_linetype_manual(values = c("dashed", "dotted", "solid")) +
theme_classic()
# se=FALSE doesn't add an area of error around the LOBF
# fun.data=mean_se calculates the mean and standard error for each point
# linerange draws a point range between an upper and lower limit for the line, and the mean for the point (using the values calculated in 'fun.data=mean_se')
cat("slope of the log(ppmr) v. log(predator mass) line of best fit for poor cod:")
## slope of the log(ppmr) v. log(predator mass) line of best fit for poor cod:
cat("\n")
#weighted by number of prey per stomach
perstmch_weighting <-
coef(lm(species_df$lppmr~species_df$lpred_weight, weight=species_df$no._prey_per_stmch))
cat("i) Weighted by no prey in the stomach:", perstmch_weighting[2])
## i) Weighted by no prey in the stomach: -1.652398
cat("\n")
#weighted by prey biomass
biomass_weighting <-
coef(lm(species_df$lppmr~species_df$lpred_weight, weight=species_df$prey_weight_g))
cat("ii) Weighted by prey biomass:", biomass_weighting[2])
## ii) Weighted by prey biomass: -0.001483607
cat("\n")
#no weighting
no_weighting <- coef(lm(species_df$lppmr~species_df$lpred_weight))
cat("iii) No weighting in the calculation:", no_weighting[2])
## iii) No weighting in the calculation: -0.08453972
cat("slope of the log(ppmr) v. log(predator mass) line of best fit for poor cod:",
"\n",
"i) Weighted by no prey in the stomach:", perstmch_weighting[2],
"\n",
"ii) Weighted by prey biomass:", biomass_weighting[2],
"\n",
"iii) No weighting in the calculation:", no_weighting[2])
## slope of the log(ppmr) v. log(predator mass) line of best fit for poor cod:
## i) Weighted by no prey in the stomach: -1.652398
## ii) Weighted by prey biomass: -0.001483607
## iii) No weighting in the calculation: -0.08453972
Looking at just the predator species ‘poor cod’. There are three LOBF (line of best fit), weighted by different variables:
When weighted by prey biomass, the gradient is equal to zero for 2 s.f.. Hence, the approximation (of slope$ = 0$) is supported by the data when looking at a prey biomass weighting.
However, given that this slope is only equal to \(0\) for one of these three weightings, the above linear models in fact show that the \(\log(\text{predator mass})\) and the \(\log(\text{PPMR})\) are independent of one another.
We are now proceeding with the idea that the \(\log(\text{predator mass})\) and \(\log(\text{PPMR})\) are independent of one another, hence the predator mass is independent of the PPMR.
Heteroscedasticity is when the residuals (error terms) are unequally scattered. For example, residuals may get more ‘spread out’ as the fitted values get larger. This occurs in data sets where there is a large range of data values, and is a problem because we want residuals with a constant variance. This is seen when the residuals are approximately normally distributed, and hence if residuals do not show a normal distribution then it is unclear whether or not the method used to infer the parameters of the linear relationship is a good one.
herring_df <- renamed_df %>% filter(pred_species == fixed("Herring"))
herring_model <- lm(lppmr ~ lpred_weight, data=herring_df)
res <- resid(herring_model)
predicted <- predict(herring_model)
#lm() fits a regression models for log(ppmr) against log(predator mass)
#resid() creates a list of the residual of each individual data point
#predict() gives the predicted value of a data point by using the previous behaviour and patterns of the data to determine possible future values
ggplot(herring_df, aes(x = lpred_weight, y = lppmr)) +
geom_smooth(method = "lm", se = FALSE, colour="black") +
geom_segment(aes(xend = lpred_weight, yend = predicted), colour="red", size=0.1) +
geom_point(size=0.5) +
labs(title='The log(predator mass) against log(ppmr) graph,
with lines from the individual points to the line of best fit: Herring',
x='log(predator mass)', y='log(ppmr)')
#showing the residuals of each point to the line of best fit
#geom_smooth() creates the regression line of the regression model from lm()
#geom_segment() draws a line from each point to the regressionline
This plot shows how distributed the points are in comparison to the line of best fit. For a ‘well fitting’ line of best fit, we would expect the residuals to be approximately normally distributed. As discussed earlier, the residual of a point is equal to: \[ \text{Residual} = \text{actual value} - \text{predicted/fitted value} , \] where the predicted/fitted value is equal to the value as predicted by the line of best fit.
ggplot(herring_model, aes(x = .fitted, y = .resid)) +
geom_point(colour="red", size=0.5) +
geom_hline(yintercept = 0, colour="grey") +
labs(title='Plot of the residuals against the fitted values: Herring',
x='Fitted Values', y='Residuals') +
geom_smooth() +
theme_classic()
#this plots the residuals of each data point, and adds a line across at res=0 to help visualise the data
#.resid creates a list of the residuals of each individual data point
This graph plots the residual of each observation against its fitted value (as predicted by the line of best fit). This plot is used to detect unequal residual variances and any outliers. We can see that the residuals generally increase as the fitted values increase, which suggests that the residuals are potentially not normally distributed. To look further into this possibility, we will next plot a qqplot.
qqnorm(res)
qqline(res)
#plot a Q-Q plot to determine if the residuals follow a normal distribution
#for a normal distribution, we would expect the data points to fall along roughly a straight line at a 45 degree angle
A qqplot (quantile-quantile plot) shows if two data sets come from the same distribution. For this data set, we plot the actual residuals against the expected residuals if the residuals were normally distributed. The points on this graph should fall on the straight line (at a 45 degree angle), and if they don’t then the residuals do not appear to be normally distributed.
ggplot(herring_model, aes(x = .resid)) +
geom_density(colour="red") +
labs(title='Density plot of the residuals: Herring', x='Residuals', y='Density')
#plots the density of the residuals of the lppmr v lpred_weight graph for Herring
We can see that for the predator species Herring, the residuals do not look like they are normally distributed. This suggests that the residuals are unequally scattered, and hence we continue with the idea that \(\log{\text{ppmr}}\) and \(\log{\text{predator species}}\) are independent of each other.
model<- lm(lppmr~lpred_weight, data=renamed_df)
ggplot(model, aes(x = .resid)) +
geom_density(colour="red") +
facet_wrap(~renamed_df$pred_species, scales = "free_y") +
labs(title="Density plot of the residuals", x='Residuals', y='Density')
#Plotting the density of the residuals for each predator species
This is a histogram plot of the residuals of the plot, separated for each predator species. We are looking for residuals which are normally distributed around resid=0.
Of the resulting histograms, the following show (approximately) a normal distribution:
Therefore, it could be said that a linear model (\(y=mx+c\)) is applicable for comparing the \(\log(\text{predator mass})\) and \(\log(\text{ppmr})\). However, since not all the residuals are strongly normally distributed, we will instead use a different model for this data set to attempt to more accurately approximate certain parameters in the variable relationships.
We will introduce a mixed effects model to account for some variance that external effects may cause to the data.
This is mathematically represented by: \[ \log(\text{ppmr})_i = \beta_1 \times \log(\text{predator mass})_i + \beta_0 \]
Where:
renamed_df %>%
ggplot(aes(x=lpred_weight, y=lppmr, group=pred_species, color=pred_species)) +
geom_line(size=1) +
labs( x='log(predator mass)', y='log(ppmr)') +
theme_classic()
#theme_classic() removes the grey background of the plot to make it easier to read
This plot is very difficult to interpret, and the relationship between the \(\log(\text{ppmr})\) and \(\log(\text{predator mass})\) is difficult to visualise as there is not a single gradient for each predator species. To aid in our interpretation, we will add random effects to account for some of the variance in the \(\log(\text{ppmr})\) values.
Fixed effects: log(predator mass)
Random effects: predator species (fixed slopes and random intercepts)
one <- lmer(lppmr ~ lpred_weight + (1|pred_species), data = renamed_df, REML=FALSE)
#same slope for every predator species, but varying intercept
renamed_df %>%
mutate(lppmr= fitted(one)) %>%
ggplot(aes(x=lpred_weight, y=lppmr, group=pred_species, color=pred_species)) +
theme_classic() +
labs( x='log(predator mass)', y='log(ppmr)') +
geom_line(size=1)
#fitted() gives the fitted values, which are the predicted values of the outcome variable (lppmr) for the model, found using the random distributions of the random variables for the slope and intercept
Here, there is the same slope for every predator species, but each species has a different intercept.
This is mathematically represented by:
\[ \log(\text{ppmr})_{ij} = \beta_{0} + \alpha_{0j} + \beta_{1} \log(\text{predator mass})_i + \epsilon_{ij} \]
Where:
Fixed effects: log(predator mass)
Random effects: predator species (random slope and fixed intercept)
two <- lmer(lppmr ~ lpred_weight + (0 + lpred_weight|pred_species), data = renamed_df, REML=FALSE)
#(0+ | ) means that the slope of the graph is different for each predator species
renamed_df %>%
mutate(lppmr = fitted(two)) %>%
ggplot(aes(x=lpred_weight, y=lppmr, group=pred_species, color=pred_species)) +
theme_classic() +
labs( x='log(predator mass)', y='log(ppmr)') +
geom_line(size=1)
This is mathematically represented by:
\[ \log(\text{ppmr})_{ij} = \beta_{0} + (\beta_{1} + \alpha_{1j}) \log(\text{predator mass})_i + \epsilon_{ij} \]
Where:
For this plot, the value of log(predator mass) is modeled to be dependent on the predator species.
Fixed effects: log(predator mass)
Random effects: predator species (random slope and random intercept)
three <- lmer(lppmr ~ lpred_weight + (1+lpred_weight|pred_species), data = renamed_df, REML=FALSE)
#Random intercept and random slope (correlated)
renamed_df %>%
mutate(lppmr = fitted(three)) %>%
ggplot(aes(x=lpred_weight, y=lppmr, group=pred_species, color=pred_species)) +
theme_classic() +
labs(x='log(predator mass)', y='log(ppmr)') +
geom_line(size=1)
This is mathematically represented by: \[ \log(\text{ppmr})_{ij} = \beta_{0} + \alpha_{0j} + (\beta_{1} + \alpha_{1j}) \log(\text{predator mass})_i + \epsilon_{ij} \]
Where:
anova(one,two,three)
## Data: renamed_df
## Models:
## one: lppmr ~ lpred_weight + (1 | pred_species)
## two: lppmr ~ lpred_weight + (0 + lpred_weight | pred_species)
## three: lppmr ~ lpred_weight + (1 + lpred_weight | pred_species)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## one 4 1108372 1108414 -554182 1108364
## two 4 1110844 1110886 -555418 1110836 0.0 0
## three 6 1106111 1106173 -553049 1106099 4737.5 2 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#comparing the three models
Here, all the random effects are modeled with a randomly distributed slope and intercept.
This is mathematically represented by:
\[ \log(\text{ppmr})_{ih} = \beta_{0} + H_{0h} + (\beta_{1} + H_{1h}) \log(\text{predator mass})_i + \epsilon_{ih} \]
Where:
And:
a <- lmer(lppmr ~ lpred_weight + (1+lpred_weight|haul_id_short), data = herring_df, REML=FALSE)
summary(a)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: lppmr ~ lpred_weight + (1 + lpred_weight | haul_id_short)
## Data: herring_df
##
## AIC BIC logLik deviance df.resid
## 13141.8 13177.6 -6564.9 13129.8 2907
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1997 -0.4304 0.0223 0.4733 3.5513
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## haul_id_short (Intercept) 10.34399 3.2162
## lpred_weight 0.05564 0.2359 -1.00
## Residual 5.21371 2.2834
## Number of obs: 2913, groups: haul_id_short, 13
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 6.18076 0.95750 6.455
## lpred_weight 0.55528 0.08691 6.389
##
## Correlation of Fixed Effects:
## (Intr)
## lpred_weght -0.894
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
herring_df %>%
mutate(lppmr= fitted(a)) %>%
ggplot(aes(x=lpred_weight, y=lppmr)) +
labs(title='log(PPMR) v. log(predator mass) plot for model a: Herring',
x='log(predator mass)', y='log(ppmr)') +
geom_point(size=0.001, colour="blue")
herring_df %>%
ggplot(aes(x = resid(a))) +
geom_density(colour="blue") +
labs(title='Density plot of the residuals for model a: Herring', x='Residuals', y='Density')
#resid(a) gives the residuals of model a
#The residuals are found by subtracting the fitted
Mathematical representation \[ \log(\text{ppmr})_{ihy} = \beta_{0} + H_{0h} + Y_{0y} + (\beta_{1} + H_{1h} + Y_{1y}) \log(\text{predator mass})_i + \epsilon_{ihyr} \]
Where:
And:
b <- lmer(lppmr ~ lpred_weight + (1+lpred_weight|haul_id_short) + (1+lpred_weight|year),
data = herring_df, REML=FALSE)
summary(b)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: lppmr ~ lpred_weight + (1 + lpred_weight | haul_id_short) + (1 +
## lpred_weight | year)
## Data: herring_df
##
## AIC BIC logLik deviance df.resid
## 13103.2 13157.0 -6542.6 13085.2 2904
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.5000 -0.4584 -0.0002 0.5399 3.5830
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## year (Intercept) 0.386395 0.62161
## lpred_weight 0.009257 0.09621 -0.47
## haul_id_short (Intercept) 7.653812 2.76655
## lpred_weight 0.026492 0.16276 -1.00
## Residual 5.108200 2.26013
## Number of obs: 2913, groups: year, 22; haul_id_short, 13
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 5.80577 0.85977 6.753
## lpred_weight 0.66658 0.08799 7.576
##
## Correlation of Fixed Effects:
## (Intr)
## lpred_weght -0.747
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
herring_df %>%
mutate(lppmr= fitted(b)) %>%
ggplot(aes(x=lpred_weight, y=lppmr)) +
labs(title='log(PPMR) v. log(predator mass) plot for model b: Herring',
x='log(predator mass)', y='log(ppmr)') +
geom_point(size=0.001, colour="red")
herring_df %>%
ggplot(aes(x = resid(b))) +
geom_density(colour="red") +
labs(title='Density plot of the residuals for model b: Herring', x='Residuals', y='Density')
Mathematical representation: \[ \log(\text{ppmr})_{ihyr} = \beta_{0} + H_{0h} + Y_{0y} + R_{0r} + (\beta_{1} + H_{1h} + Y_{1y} + R_{1r}) \log(\text{predator mass})_i + \epsilon_{ihyr} \]
Where:
And:
c <- lmer(lppmr ~ lpred_weight + (1+lpred_weight|haul_id_short) + (1+lpred_weight|year) +
(1+lpred_weight|ices_rectangle), data = herring_df, REML=FALSE)
summary(c)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: lppmr ~ lpred_weight + (1 + lpred_weight | haul_id_short) + (1 +
## lpred_weight | year) + (1 + lpred_weight | ices_rectangle)
## Data: herring_df
##
## AIC BIC logLik deviance df.resid
## 13035.0 13106.8 -6505.5 13011.0 2901
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.2280 -0.4625 0.0352 0.5052 3.8013
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ices_rectangle (Intercept) 2.531025 1.59092
## lpred_weight 0.034443 0.18559 -1.00
## year (Intercept) 0.086118 0.29346
## lpred_weight 0.010707 0.10347 1.00
## haul_id_short (Intercept) 5.751788 2.39829
## lpred_weight 0.007871 0.08872 -1.00
## Residual 4.828733 2.19744
## Number of obs: 2913, groups: ices_rectangle, 152; year, 22; haul_id_short, 13
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 5.67779 0.82585 6.875
## lpred_weight 0.64541 0.09742 6.625
##
## Correlation of Fixed Effects:
## (Intr)
## lpred_weght -0.624
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
herring_df %>%
mutate(lppmr= fitted(c)) %>%
ggplot(aes(x=lpred_weight, y=lppmr)) +
labs(title='log(PPMR) v. log(predator mass) plot for model c: Herring',
x='log(predator mass)', y='log(ppmr)') +
geom_point(size=0.001, colour="green")
herring_df %>%
ggplot(aes(x = resid(c))) +
geom_density(colour="green") +
labs(title='Density plot of the residuals for model c: Herring', x='Residuals', y='Density')
Variance of the groups decreases as other random effects are added to the models. But, the error term (standard error) roughly increases as more random effects are added to the model.
Model c appears to represent the data the most effectively. We now want to consider a mixed effects model over all the predator species (not just Herring). Therefore, we choose to continue with model c (with random effects: haul_id_short, year, ices_rectangle), but we also introduce a fourth random effect of pred_species.
Mathematical representation \[ \log(\text{ppmr})_{ihyr} = \beta_{0} + H_{0h} + Y_{0y} + R_{0r} + (\beta_{1} + H_{1h} + Y_{1y} + R_{1r}) \log(\text{predator mass})_i + \epsilon_{ihyr} \] Where:
(where each random intercept term is normally distributed with a mean of \(0\) and a standard deviation \(\omega_{[00]}\) (i.e. \(sim (0, \omega^2_{[00]})\))).
And:
all <- lmer(lppmr ~ lpred_weight + (1+lpred_weight|haul_id_short) + (1+lpred_weight|year) +
(1+lpred_weight|ices_rectangle) + (1+lpred_weight|pred_species),
data = renamed_df, REML=FALSE)
summary(all)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: lppmr ~ lpred_weight + (1 + lpred_weight | haul_id_short) + (1 +
## lpred_weight | year) + (1 + lpred_weight | ices_rectangle) +
## (1 + lpred_weight | pred_species)
## Data: renamed_df
##
## AIC BIC logLik deviance df.resid
## 1050649.5 1050806.2 -525309.8 1050619.5 254580
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.7029 -0.6066 -0.0387 0.5777 6.7472
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ices_rectangle (Intercept) 0.74808 0.8649
## lpred_weight 0.02243 0.1498 -0.77
## year (Intercept) 1.01486 1.0074
## lpred_weight 0.01590 0.1261 -0.84
## haul_id_short (Intercept) 2.79267 1.6711
## lpred_weight 0.04027 0.2007 -0.47
## pred_species (Intercept) 0.60683 0.7790
## lpred_weight 0.02887 0.1699 -0.29
## Residual 3.58532 1.8935
## Number of obs: 254595, groups:
## ices_rectangle, 718; year, 97; haul_id_short, 97; pred_species, 16
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 5.12662 0.31570 16.239
## lpred_weight 0.28973 0.05613 5.161
##
## Correlation of Fixed Effects:
## (Intr)
## lpred_weght -0.489
## optimizer (nloptwrap) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.0300338 (tol = 0.002, component 1)
renamed_df %>%
mutate(lppmr= fitted(all)) %>%
ggplot(aes(x=lpred_weight, y=lppmr)) +
labs(title='log(PPMR) v. log(predator mass) plot',
x='log(predator mass)', y='log(ppmr)') +
geom_point(size=0.001, colour="black") +
theme_classic()
renamed_df %>%
ggplot(aes(x = resid(all))) +
geom_density(colour="black") +
labs(title='Density plot of the residuals', x='Residuals', y='Density')
This model has the following variables:
| Variable name | Meaning | Value |
|---|---|---|
| \(\beta_{0}\) | the intercept term | 5.12662 |
| \(\beta_{1}\) | the fixed effect (slope) for \(\log(\text{predator mass})\) | 0.28973 |
| \(\omega_{H[00]}\) | s.d. for the intercept term \(H_{0h}\), dependent on the haul_id_short | 1.6711 |
| \(\omega_{H[10]}\) | s.d. for the slope term \(H_{1h}\), dependent on the haul_id_short | 0.2007 |
| \(\omega_{Y[00]}\) | s.d. for the intercept term \(Y_{0y}\), dependent on the year | 1.0074 |
| \(\omega_{Y[10]}\) | s.d. for the slope term \(Y_{1y}\), dependent on the year | 0.1261 |
| \(\omega_{R[00]}\) | s.d. for the intercept term \(R_{0r}\), dependent on the ices_rectangle | 0.8649 |
| \(\omega_{R[10]}\) | s.d. for the slope term \(R_{1r}\), dependent on the ices_rectangle | 0.1498 |
| \(\omega_{P[00]}\) | s.d. for the intercept term \(P_{0p}\), dependent on the pred_species | 0.7790 |
| \(\omega_{P[10]}\) | s.d. for the slope term \(P_{1p}\), dependent on the pred_species | 0.1699 |
| \(\epsilon_{ihyr}\) | the s.d. of the residual/error term | 1.8935 |
#citation()
#devtools::session_info()